In [1]:
import time
from oneflow.core.models import *
In [9]:
# Select the articles matched by the `url__startswith` filter on the
# 'http://da.feed' prefix, then show how many there are as the cell output.
url_prefix_query = {'url__startswith': 'http://da.feed'}
arts = Article.objects(**url_prefix_query)
arts.count()
Out[9]:
In [11]:
# Call absolutize_url() on each selected article; the [:10] slice limits the
# loop to the first ten results of the `arts` queryset.
for article in arts[:10]:
    article.absolutize_url()
In [ ]:
In [2]:
# --- `content` statistics ----------------------------------------------------
# Builds one queryset per content_type / content_error combination, counts
# each of them, and prints a single timestamped summary line.
# NOTE(review): the meaning of the type values is taken from the variable
# names used here (0 -> empty, 1 -> html, 2 -> markdown) — confirm against
# the Article model.

# querysets
empty_articles = Article.objects(content_type=0)  # not counted on its own
unparsed_articles = Article.objects(content_type__exists=False)
# "empty" articles are split by whether a content_error has been recorded.
# `pending_articles` is reused by the parse_content dispatch cell below.
pending_articles = empty_articles.filter(content_error__exists=False)
error_articles = empty_articles.filter(content_error__exists=True)
# Every article carrying a content_error, whatever its content_type.
error2_articles = Article.objects(content_error__exists=True)
html_articles = Article.objects(content_type=1)
markdown_articles = Article.objects(content_type=2)

# counts
# `total_articles_count` is also read by the full_content statistics cell.
total_articles_count = Article.objects().count()
unparsed_articles_count = unparsed_articles.count()
pending_articles_count = pending_articles.count()
error_articles_count = error_articles.count()
error2_articles_count = error2_articles.count()
html_articles_count = html_articles.count()
markdown_articles_count = markdown_articles.count()

# A single parenthesised argument prints the same line under the Python 2
# print statement and the Python 3 print() function.
# NOTE(review): `global_feed_stats` is not defined in this notebook; it
# presumably comes from the star import in the first cell — confirm.
print('- %s: CONTENT pending: %s, errors: %s/%s, md: %s, html:%s, unparsed: %s, total: %s; fetched: %s, dupes: %s' % (
    time.strftime('%Y%m%d %H:%M'),
    pending_articles_count,
    error_articles_count, error2_articles_count,
    markdown_articles_count,
    html_articles_count,
    unparsed_articles_count,
    total_articles_count,
    global_feed_stats.fetched(), global_feed_stats.dupes()
))
20130711 13:56: CONTENT pending: 26263, errors: 4710/7800, md: 67586, html:0, unparsed: 2291701, total: 2390258; fetched: 196455, dupes: 4225707
20130706 ??:??: CONTENT pending: 14967, errors: 0854, md: 33341, html:0, unparsed: 2291701, total: 2340863
In [4]:
# --- `full_content` statistics -----------------------------------------------
# Builds one queryset per full_content_type / full_content_error combination,
# counts each of them, and prints a single timestamped summary line.
# NOTE(review): the meaning of the type values is taken from the variable
# names used here (0 -> empty, 1 -> html, 2 -> markdown) — confirm against
# the Article model.

# querysets
f_empty_articles = Article.objects(full_content_type=0)  # not counted on its own
f_unparsed_articles = Article.objects(full_content_type__exists=False)
# "empty" articles are split by whether a full_content_error has been recorded.
# `f_pending_articles` is reused by the parse_full_content dispatch cell below.
f_pending_articles = f_empty_articles.filter(full_content_error__exists=False)
f_error_articles = f_empty_articles.filter(full_content_error__exists=True)
# Every article carrying a full_content_error, whatever its full_content_type.
f_error2_articles = Article.objects(full_content_error__exists=True)
f_html_articles = Article.objects(full_content_type=1)
f_markdown_articles = Article.objects(full_content_type=2)

# counts
# The total is computed here rather than read from the `content` statistics
# cell, so this cell does not depend on that cell having been run and the
# reported total matches the timestamp printed below.
f_total_articles_count = Article.objects().count()
f_unparsed_articles_count = f_unparsed_articles.count()
f_pending_articles_count = f_pending_articles.count()
f_error_articles_count = f_error_articles.count()
f_error2_articles_count = f_error2_articles.count()
f_html_articles_count = f_html_articles.count()
f_markdown_articles_count = f_markdown_articles.count()

# A single parenthesised argument prints the same line under the Python 2
# print statement and the Python 3 print() function.
print('- %s: FULL_CONTENT pending: %s, errors: %s/%s, md: %s, html:%s, unparsed: %s, total: %s' % (
    time.strftime('%Y%m%d %H:%M'),
    f_pending_articles_count,
    f_error_articles_count, f_error2_articles_count,
    f_markdown_articles_count,
    f_html_articles_count,
    f_unparsed_articles_count,
    f_total_articles_count,
))
In [12]:
# WARNING: content parsing must be re-enabled in constance BEFORE running
# this cell!!
# Calls parse_content.delay() once per article in `pending_articles`
# (presumably queues an asynchronous task per article — confirm).
for article in pending_articles:
    article.parse_content.delay()
In [5]:
# WARNING: content parsing must be re-enabled in constance BEFORE running
# this cell!!
# Calls parse_full_content.delay() once per article in `f_pending_articles`
# (presumably queues an asynchronous task per article — confirm).
for article in f_pending_articles:
    article.parse_full_content.delay()
In [12]:
# Fetch one specific article by its id so it can be inspected by hand.
article_id = "51d968514adc897c6893920f"
article = Article.objects.get(id=article_id)
In [20]:
# Dump the error fields, the content bodies and the type values (both the
# `content` and `full_content` variants) of the currently bound `article`.
# NOTE(review): if this runs under Python 2, `print (...)` prints the repr of
# a single 6-tuple; under Python 3 it prints the six values separated by
# spaces — confirm which output format is intended.
print (article.content_error, article.full_content_error,
article.content, article.full_content,
article.content_type, article.full_content_type)
In [ ]: